import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
from scipy.stats import norm
from scipy import stats
import statsmodels.api as sm
# Load the course-project data set and report centre/spread of each series.
df = pd.read_excel(r'/Users/reevaandipara/Assignments/MA541_Course_Project_Data.xlsx')
for column in ('Close_ETF', 'oil', 'gold', 'JPM'):
    series = df[column]
    print("%s: mean=%.3f stdv=%.3f" % (column, series.mean(), series.std()))
Close_ETF: mean=121.153 stdv=12.570 oil: mean=0.001 stdv=0.021 gold: mean=0.001 stdv=0.011 JPM: mean=0.001 stdv=0.011
# Pearson correlation for every pair of the four series.
pairs = (('Close_ETF', 'oil'), ('Close_ETF', 'gold'), ('Close_ETF', 'JPM'),
         ('oil', 'gold'), ('oil', 'JPM'), ('gold', 'JPM'))
for left, right in pairs:
    print("%s-%s: " % (left, right), df[left].corr(df[right]))
Close_ETF-oil: -0.009044842009622077 Close_ETF-gold: 0.022995570076054603 Close_ETF-JPM: 0.03680705773259183 oil-gold: 0.2356503718414409 oil-JPM: -0.12084893009495935 gold-JPM: 0.10016984211388416
# Part-2
# 1) Histogram for each column: pandas draws one subplot per numeric
#    column (Close_ETF, oil, gold, JPM) and returns the axes array.
df.hist()
array([[<AxesSubplot:title={'center':'Close_ETF'}>,
<AxesSubplot:title={'center':'oil'}>],
[<AxesSubplot:title={'center':'gold'}>,
<AxesSubplot:title={'center':'JPM'}>]], dtype=object)
pip install plotly
Requirement already satisfied: plotly in /Users/reevaandipara/opt/anaconda3/lib/python3.8/site-packages (5.4.0) Requirement already satisfied: tenacity>=6.2.0 in /Users/reevaandipara/opt/anaconda3/lib/python3.8/site-packages (from plotly) (8.0.1) Requirement already satisfied: six in /Users/reevaandipara/opt/anaconda3/lib/python3.8/site-packages (from plotly) (1.15.0) Note: you may need to restart the kernel to use updated packages.
import plotly.express as px

# Column aliases; the scatter-plot cell below reuses a, b, c and d.
a = df['Close_ETF']
b = df['oil']
c = df['gold']
d = df['JPM']

# One interactive line chart per series.
for series in (a, b, c, d):
    fig = px.line(series)
    fig.show()

# Standardise (z-score) every column so all four series share one scale,
# then overlay them in a single chart.
x = (df - df.mean()) / df.std()
fig = px.line(x)
fig.show()
# Scatter plots of the ETF closing price against each of the other series.
# BUG FIX: matplotlib 3.3+ deprecated case-insensitive keyword properties,
# so `Color=` emitted MatplotlibDeprecationWarning (and will eventually be
# an error); the keyword is lowercase `color=`.
plt.scatter(a, b)
plt.show()
plt.scatter(a, c, color='green')
plt.show()
plt.scatter(a, d, color='red')
plt.show()
<ipython-input-13-3e526fe5dde8>:3: MatplotlibDeprecationWarning: Case-insensitive properties were deprecated in 3.3 and support will be removed two minor releases later
<ipython-input-13-3e526fe5dde8>:5: MatplotlibDeprecationWarning: Case-insensitive properties were deprecated in 3.3 and support will be removed two minor releases later
from scipy.stats import chisquare  # kept: later cells in this file still call it

# Normality check for Close_ETF (H0: the data are normally distributed).
# BUG FIX: scipy.stats.chisquare treats its input as observed *frequencies*
# and tests them against a uniform expected table -- it is not a normality
# test. Use the D'Agostino-Pearson normality test instead.
stat, p_value = stats.normaltest(np.array(df['Close_ETF']))
if p_value > 0.1:
    print("Normal")
else:
    print("Reject H0")
Reject H0
# Normality check for the oil returns (H0: normally distributed).
# BUG FIX: chisquare on raw data -- which here includes negative values --
# is not a valid normality test; use the D'Agostino-Pearson test.
stat, p_value = stats.normaltest(np.array(df['oil']))
if p_value > 0.1:
    print("Normal")
else:
    print("Reject H0")
Normal
# Normality check for the gold returns (H0: normally distributed).
# BUG FIX: chisquare is a frequency goodness-of-fit test, not a normality
# test; use the D'Agostino-Pearson test.
stat, p_value = stats.normaltest(np.array(df['gold']))
if p_value > 0.1:
    print("Normal")
else:
    print("Reject H0")
Normal
# Normality check for the JPM returns (H0: normally distributed).
# BUG FIX: chisquare is a frequency goodness-of-fit test, not a normality
# test; use the D'Agostino-Pearson test.
stat, p_value = stats.normaltest(np.array(df['JPM']))
if p_value > 0.1:
    print("Normal")
else:
    print("Reject H0")
Normal
import statistics as st
# Treat the full Close_ETF series as the population and report its
# mean and standard deviation.
p_mean = st.mean(df['Close_ETF'])
print("The mean of the population is %s" %p_mean)
# NOTE(review): st.stdev is the *sample* standard deviation (n-1 divisor);
# for a true population figure st.pstdev would be the right call --
# confirm which one the assignment intends.
p_stdev = st.stdev(df['Close_ETF'])
print("The standard deviation of the population is %s" %p_stdev)
The mean of the population is 121.152960012 The standard deviation of the population is 12.569790313110744
# Partition the Close_ETF observations into consecutive samples of 20
# (1000 rows -> 50 samples) and print each one.
data = df['Close_ETF'].tolist()
n = 20
sample = [data[start:start + n] for start in range(0, len(df), n)]
for idx, grp in enumerate(sample[:50], start=1):
    print("Sample %s is : \n %s" % (idx, grp))
Sample 1 is : [97.349998, 97.75, 99.160004, 99.650002, 99.260002, 98.25, 99.25, 100.300003, 100.610001, 99.559998, 101.660004, 101.660004, 101.57, 100.019997, 99.440002, 98.419998, 98.519997, 97.529999, 98.800003, 97.660004] Sample 2 is : [97.629997, 98.529999, 99.769997, 98.739998, 100.699997, 101.150002, 100.580002, 99.300003, 100.239998, 100.730003, 100.510002, 99.919998, 98.5, 99.510002, 98.279999, 99.169998, 99.239998, 98.489998, 100.230003, 99.860001] Sample 3 is : [99.400002, 99.160004, 99.389999, 98.510002, 98.510002, 96.419998, 96.980003, 98.0, 98.279999, 98.650002, 99.550003, 99.040001, 99.309998, 99.620003, 100.480003, 100.860001, 100.449997, 100.769997, 99.769997, 99.93] Sample 4 is : [100.110001, 100.139999, 100.760002, 101.440002, 102.800003, 103.360001, 103.410004, 102.830002, 103.68, 103.0, 101.959999, 102.260002, 102.449997, 102.089996, 103.580002, 103.379997, 104.599998, 103.669998, 102.550003, 102.940002] Sample 5 is : [101.110001, 100.279999, 99.949997, 100.93, 99.949997, 102.080002, 102.449997, 103.389999, 103.860001, 104.260002, 104.0, 104.279999, 104.57, 104.900002, 105.269997, 104.989998, 105.410004, 104.260002, 105.040001, 104.860001] Sample 6 is : [103.540001, 103.349998, 103.580002, 103.629997, 105.040001, 105.18, 105.400002, 105.300003, 105.989998, 105.760002, 105.839996, 106.400002, 105.610001, 105.18, 105.150002, 106.330002, 106.360001, 105.459999, 104.93, 103.839996] Sample 7 is : [104.720001, 103.779999, 104.209999, 105.589996, 105.989998, 106.370003, 106.449997, 107.599998, 107.330002, 107.160004, 107.599998, 106.849998, 107.57, 106.739998, 106.730003, 107.93, 108.139999, 107.599998, 108.160004, 108.5] Sample 8 is : [109.720001, 108.900002, 109.660004, 109.730003, 109.620003, 109.699997, 111.160004, 111.18, 111.279999, 111.230003, 112.440002, 112.550003, 112.93, 113.379997, 112.389999, 113.220001, 112.559998, 113.5, 113.779999, 114.230003] Sample 9 is : [114.199997, 115.099998, 114.800003, 114.43, 115.870003, 114.68, 113.370003, 
113.480003, 113.480003, 113.970001, 113.779999, 112.849998, 113.18, 114.449997, 114.480003, 114.849998, 116.07, 115.650002, 115.129997, 116.169998] Sample 10 is : [115.660004, 115.230003, 114.879997, 114.589996, 114.389999, 114.870003, 114.940002, 115.019997, 116.160004, 115.480003, 115.690002, 115.989998, 116.379997, 114.959999, 114.5, 112.580002, 111.120003, 112.580002, 111.199997, 111.790001] Sample 11 is : [113.040001, 113.07, 111.059998, 109.650002, 109.459999, 109.550003, 111.0, 111.029999, 112.589996, 112.970001, 113.099998, 113.779999, 114.639999, 115.269997, 114.900002, 114.629997, 114.370003, 114.82, 113.209999, 113.389999] Sample 12 is : [112.959999, 113.830002, 113.830002, 111.919998, 112.669998, 114.25, 114.360001, 114.199997, 114.300003, 112.82, 111.830002, 110.959999, 112.150002, 112.059998, 112.779999, 111.809998, 109.959999, 108.830002, 109.75, 110.449997] Sample 13 is : [109.989998, 110.040001, 109.099998, 109.650002, 109.269997, 109.620003, 109.809998, 110.269997, 111.849998, 112.239998, 112.870003, 112.860001, 112.709999, 113.129997, 112.089996, 112.980003, 114.699997, 114.860001, 113.790001, 114.349998] Sample 14 is : [113.220001, 114.019997, 114.0, 113.830002, 113.629997, 113.199997, 113.769997, 114.75, 114.389999, 113.839996, 113.449997, 113.919998, 114.529999, 112.940002, 112.879997, 111.889999, 112.220001, 111.440002, 111.730003, 111.779999] Sample 15 is : [111.860001, 111.519997, 110.800003, 110.709999, 110.239998, 111.639999, 109.580002, 109.879997, 108.959999, 108.75, 109.769997, 110.099998, 110.57, 110.839996, 111.07, 110.209999, 110.199997, 108.400002, 106.849998, 107.0] Sample 16 is : [108.379997, 108.160004, 106.980003, 107.190002, 108.300003, 108.910004, 110.029999, 109.709999, 110.480003, 110.199997, 110.349998, 111.099998, 111.099998, 111.449997, 110.529999, 111.110001, 111.32, 112.580002, 112.120003, 112.860001] Sample 17 is : [112.580002, 112.480003, 113.059998, 113.43, 113.660004, 112.800003, 113.139999, 113.150002, 112.470001, 
112.959999, 111.550003, 110.949997, 111.459999, 110.75, 111.279999, 111.839996, 111.760002, 113.650002, 113.839996, 113.900002] Sample 18 is : [114.68, 113.449997, 112.510002, 112.970001, 112.529999, 111.540001, 110.639999, 111.260002, 111.68, 110.739998, 110.519997, 111.239998, 109.989998, 109.860001, 111.540001, 112.879997, 113.220001, 113.199997, 113.510002, 113.550003] Sample 19 is : [114.980003, 116.550003, 117.279999, 117.110001, 116.879997, 116.970001, 117.860001, 118.790001, 118.730003, 117.879997, 118.580002, 118.739998, 117.419998, 117.980003, 118.160004, 118.440002, 118.349998, 117.809998, 117.889999, 119.230003] Sample 20 is : [119.330002, 119.25, 119.209999, 118.099998, 118.790001, 119.209999, 119.330002, 120.370003, 120.790001, 120.879997, 120.809998, 119.440002, 119.470001, 120.389999, 120.68, 120.769997, 120.519997, 121.18, 121.360001, 121.129997] Sample 21 is : [120.870003, 120.300003, 118.830002, 118.010002, 118.610001, 118.440002, 119.0, 118.18, 118.57, 117.620003, 118.239998, 119.470001, 118.220001, 117.5, 116.779999, 116.550003, 116.879997, 117.230003, 117.43, 117.43] Sample 22 is : [118.160004, 118.910004, 119.269997, 118.959999, 120.230003, 120.07, 120.209999, 119.309998, 119.739998, 120.769997, 120.68, 121.129997, 121.209999, 120.230003, 120.389999, 118.599998, 119.449997, 120.239998, 121.43, 120.629997] Sample 23 is : [121.230003, 121.169998, 121.220001, 122.730003, 122.790001, 122.330002, 120.970001, 121.239998, 120.389999, 121.139999, 120.139999, 119.120003, 119.360001, 118.540001, 118.099998, 116.900002, 117.0, 117.139999, 117.309998, 116.529999] Sample 24 is : [118.18, 117.959999, 117.43, 117.629997, 118.190002, 118.599998, 119.239998, 118.0, 118.089996, 118.699997, 117.300003, 115.769997, 114.150002, 114.5, 115.410004, 113.800003, 116.010002, 115.57, 116.330002, 115.199997] Sample 25 is : [115.650002, 114.199997, 115.75, 116.400002, 116.599998, 117.5, 117.459999, 117.089996, 117.82, 116.599998, 117.239998, 115.949997, 115.720001, 
116.800003, 117.580002, 118.790001, 119.290001, 119.120003, 119.779999, 119.5] Sample 26 is : [119.410004, 120.050003, 120.25, 119.480003, 120.5, 120.760002, 120.150002, 120.040001, 120.129997, 119.910004, 120.480003, 120.199997, 120.580002, 120.860001, 121.089996, 121.400002, 121.360001, 121.400002, 121.470001, 121.57] Sample 27 is : [119.860001, 118.980003, 119.150002, 120.150002, 119.830002, 119.18, 119.529999, 120.489998, 119.480003, 119.949997, 121.32, 121.940002, 122.260002, 122.43, 122.910004, 122.839996, 122.349998, 123.019997, 123.440002, 122.720001] Sample 28 is : [123.540001, 123.190002, 123.339996, 123.790001, 124.57, 123.739998, 123.650002, 124.389999, 124.720001, 123.720001, 122.879997, 122.650002, 123.389999, 123.330002, 123.82, 123.059998, 123.82, 122.209999, 122.199997, 122.190002] Sample 29 is : [122.470001, 122.470001, 122.239998, 121.150002, 121.589996, 120.760002, 121.690002, 121.050003, 122.580002, 122.489998, 122.269997, 123.699997, 123.910004, 123.5, 124.599998, 124.349998, 123.660004, 123.209999, 123.150002, 123.5] Sample 30 is : [123.089996, 122.050003, 120.910004, 121.339996, 121.440002, 121.580002, 121.550003, 121.669998, 122.660004, 123.040001, 122.599998, 121.220001, 119.629997, 119.199997, 119.610001, 118.599998, 118.43, 117.5, 117.43, 118.669998] Sample 31 is : [119.110001, 117.82, 119.779999, 117.669998, 118.129997, 119.959999, 119.720001, 119.370003, 118.099998, 119.800003, 120.129997, 120.489998, 121.75, 122.269997, 122.110001, 122.230003, 122.230003, 122.389999, 123.339996, 123.760002] Sample 32 is : [123.690002, 123.239998, 123.489998, 124.639999, 125.129997, 125.760002, 126.300003, 127.029999, 127.129997, 126.230003, 126.089996, 125.410004, 126.690002, 126.849998, 126.580002, 126.82, 126.080002, 126.0, 126.330002, 126.449997] Sample 33 is : [127.309998, 127.809998, 127.440002, 126.360001, 125.709999, 125.830002, 126.029999, 126.690002, 126.760002, 125.470001, 125.75, 125.190002, 124.510002, 126.699997, 127.300003, 127.379997, 
128.440002, 128.770004, 128.899994, 129.309998] Sample 34 is : [128.800003, 128.679993, 128.330002, 127.82, 128.309998, 125.970001, 126.419998, 126.550003, 126.660004, 127.360001, 128.539993, 128.440002, 127.5, 128.389999, 126.900002, 126.269997, 126.599998, 125.480003, 126.620003, 126.410004] Sample 35 is : [126.639999, 126.410004, 127.75, 128.199997, 129.160004, 128.809998, 128.490005, 129.270004, 129.080002, 129.410004, 129.460007, 128.229996, 129.369995, 129.360001, 128.759995, 128.169998, 127.970001, 128.240005, 127.379997, 128.589996] Sample 36 is : [128.830002, 130.179993, 130.759995, 131.029999, 130.619995, 130.410004, 129.589996, 130.380005, 130.110001, 130.210007, 130.020004, 129.220001, 130.029999, 129.800003, 129.830002, 129.729996, 130.559998, 131.009995, 130.869995, 129.539993] Sample 37 is : [129.740005, 128.639999, 128.880005, 128.710007, 128.660004, 130.699997, 130.949997, 131.130005, 131.149994, 130.910004, 130.369995, 130.399994, 131.029999, 131.470001, 130.399994, 131.380005, 130.889999, 131.360001, 132.520004, 132.360001] Sample 38 is : [132.619995, 132.550003, 130.949997, 129.240005, 129.5, 129.309998, 126.849998, 126.209999, 127.099998, 126.129997, 125.169998, 126.169998, 127.730003, 128.380005, 126.099998, 127.510002, 128.729996, 129.0, 127.120003, 126.809998] Sample 39 is : [125.860001, 125.75, 125.349998, 123.989998, 122.550003, 123.5, 126.120003, 124.650002, 123.669998, 124.089996, 125.010002, 124.459999, 124.440002, 124.989998, 123.910004, 124.75, 127.269997, 127.279999, 127.800003, 127.07] Sample 40 is : [127.440002, 126.610001, 126.849998, 127.410004, 126.050003, 124.0, 123.519997, 123.339996, 123.970001, 125.690002, 124.830002, 123.949997, 126.209999, 126.660004, 126.830002, 126.709999, 127.980003, 127.849998, 127.220001, 128.080002] Sample 41 is : [128.419998, 128.199997, 127.900002, 127.410004, 126.980003, 127.370003, 127.010002, 127.110001, 128.630005, 129.699997, 130.029999, 130.690002, 130.130005, 130.119995, 129.759995, 
129.649994, 129.809998, 130.429993, 130.580002, 130.660004] Sample 42 is : [130.639999, 131.419998, 131.669998, 130.509995, 129.910004, 130.279999, 130.410004, 131.690002, 132.220001, 132.229996, 131.960007, 132.139999, 131.809998, 132.509995, 131.869995, 131.470001, 132.479996, 133.580002, 133.740005, 133.690002] Sample 43 is : [133.580002, 133.080002, 133.360001, 134.919998, 135.179993, 135.009995, 135.089996, 135.270004, 135.110001, 136.279999, 136.630005, 136.839996, 136.410004, 136.589996, 136.809998, 136.839996, 137.470001, 137.880005, 138.350006, 138.779999] Sample 44 is : [137.910004, 137.809998, 137.789993, 136.860001, 136.779999, 136.539993, 138.080002, 138.610001, 138.910004, 138.179993, 138.240005, 138.580002, 139.619995, 140.020004, 140.380005, 140.419998, 140.5, 140.639999, 140.919998, 140.350006] Sample 45 is : [138.419998, 139.020004, 140.470001, 140.529999, 140.220001, 141.289993, 141.899994, 141.779999, 141.160004, 141.419998, 141.830002, 141.720001, 141.869995, 143.119995, 142.339996, 141.949997, 142.220001, 142.139999, 141.619995, 140.75] Sample 46 is : [141.580002, 142.509995, 142.210007, 141.619995, 141.369995, 141.669998, 140.539993, 141.190002, 141.070007, 141.539993, 142.160004, 143.240005, 142.960007, 143.020004, 142.539993, 142.820007, 142.380005, 142.800003, 143.949997, 142.259995] Sample 47 is : [142.130005, 142.050003, 142.789993, 143.75, 144.610001, 144.809998, 144.850006, 144.889999, 145.210007, 145.020004, 143.940002, 143.449997, 144.660004, 145.610001, 145.800003, 145.729996, 146.039993, 145.979996, 145.869995, 145.300003] Sample 48 is : [145.169998, 139.5, 140.929993, 140.509995, 138.669998, 137.350006, 139.699997, 139.559998, 140.740005, 140.779999, 140.990005, 138.25, 139.279999, 139.470001, 138.529999, 140.199997, 140.970001, 143.289993, 143.179993, 143.389999] Sample 49 is : [143.199997, 142.860001, 141.820007, 141.970001, 142.0, 142.160004, 143.690002, 143.850006, 144.240005, 144.440002, 144.610001, 144.020004, 144.660004, 
145.320007, 146.699997, 147.089996, 147.270004, 147.229996, 148.619995, 148.059998] Sample 50 is : [148.119995, 149.479996, 149.649994, 149.529999, 148.289993, 148.669998, 149.539993, 150.350006, 150.919998, 150.949997, 150.75, 151.160004, 149.580002, 150.860001, 150.529999, 150.570007, 151.600006, 151.300003, 152.619995, 152.539993]
# Mean of each of the 50 size-20 samples, then a histogram of those means.
sample_mean = [st.mean(grp) for grp in sample[:50]]
plt.hist(sample_mean, 50)
plt.title("Histogram plot for Sample Means")
plt.xlabel("Samples")
plt.ylabel("Frequency")
plt.show()
# Recompute the 50 sample means, then their overall mean and spread.
sample_mean = [st.mean(grp) for grp in sample[:50]]
total_sample_mean = st.mean(sample_mean)
total_sample_std_deviation = st.stdev(sample_mean)
print("The mean of the sample is %s" % total_sample_mean)
print("The standard deviation of the sample is %s" % total_sample_std_deviation)
The mean of the sample is 121.15296001200001 The standard deviation of the sample is 12.615972812491503
# Repeat with 10 consecutive samples of size 100 each.
data = df['Close_ETF'].tolist()
n = 100
sample = [data[start:start + n] for start in range(0, len(df), n)]

# Mean of each sample, printed, then a histogram of the 10 means.
sample_mean = [st.mean(grp) for grp in sample[:10]]
for idx, mean_val in enumerate(sample_mean, start=1):
    print("Mean of Sample %s is : \n %s" % (idx, mean_val))

plt.hist(sample_mean, 10)
plt.title("Histogram plot for Sample Means")
plt.xlabel("Samples")
plt.ylabel("Frequency")
plt.show()
Mean of Sample 1 is : 100.77430029 Mean of Sample 2 is : 110.48050028 Mean of Sample 3 is : 112.01809939 Mean of Sample 4 is : 114.51720014 Mean of Sample 5 is : 118.40030004 Mean of Sample 6 is : 121.6768003 Mean of Sample 7 is : 125.78560011 Mean of Sample 8 is : 128.01269998 Mean of Sample 9 is : 135.39209964 Mean of Sample 10 is : 144.47199995
# Overall mean and spread of the 10 size-100 sample means.
total_sample_mean = st.mean(sample_mean)
total_sample_std_deviation = st.stdev(sample_mean)
for label, value in (("mean", total_sample_mean),
                     ("standard deviation", total_sample_std_deviation)):
    print("The %s of the sample is %s" % (label, value))
The mean of the sample is 121.152960012 The standard deviation of the sample is 12.821725528306825
import random

# Draw 50 simple random samples (without replacement) of size 20 each.
df = pd.read_excel(r'/Users/reevaandipara/Assignments/MA541_Course_Project_Data.xlsx')
data = df['Close_ETF'].tolist()
sample = [random.sample(data, 20) for _ in range(50)]

# Histogram of the 50 random-sample means.
sample_mean = [st.mean(grp) for grp in sample]
plt.hist(sample_mean, 50)
plt.title("Histogram plot for Sample Means")
plt.xlabel("Samples")
plt.ylabel("Frequency")
plt.show()

# Overall mean and spread of those random-sample means.
total_sample_mean = st.mean(sample_mean)
total_sample_std_dev = st.stdev(sample_mean)
print(total_sample_mean)
print(total_sample_std_dev)
121.032060117 2.676564015622219
# Now 10 simple random samples of size 100 each.
sample = [random.sample(data, 100) for _ in range(10)]

# Mean of each sample, printed, then a histogram of the 10 means.
sample_mean = [st.mean(grp) for grp in sample]
for idx, mean_val in enumerate(sample_mean, start=1):
    print("Mean of Sample %s is : \n %s" % (idx, mean_val))

plt.hist(sample_mean, 10)
plt.title("Histogram plot for Sample Means")
plt.xlabel("Samples")
plt.ylabel("Frequency")
plt.show()
Mean of Sample 1 is : 119.3641999 Mean of Sample 2 is : 123.48439985 Mean of Sample 3 is : 120.32880047 Mean of Sample 4 is : 123.16389991 Mean of Sample 5 is : 120.40849959 Mean of Sample 6 is : 119.81640007 Mean of Sample 7 is : 120.37850014 Mean of Sample 8 is : 118.55520004 Mean of Sample 9 is : 120.02310005 Mean of Sample 10 is : 122.47930006
# Overall mean and spread of the 10 random-sample means (size 100 each).
total_sample_mean = st.mean(sample_mean)
total_sample_std_dev = st.stdev(sample_mean)
for value in (total_sample_mean, total_sample_std_dev):
    print(value)
120.800230008 1.661282247205508
The sampling distribution of sample means approaches the normal distribution as the sample size gets larger. This is especially true for sample sizes greater than 30. Therefore, since the sample size in the 10th part is 100 — large compared to the other sample sizes — its sampling distribution is closer to normal than the others.
Hence, a large sample size (in this project, a sample size of 100) is consistent with the Central Limit Theorem.
# 95% confidence level for a sample of size 100 -> significance level alpha.
confidence = 0.95
n = 100
alpha = 1 - confidence
print(alpha)
0.050000000000000044
# Consider Sample 6 for analysis
# (index 5 of `sample`, which at this point holds the 10 random samples of
# size 100 drawn above).
x = st.mean(sample[5])      # sample mean: centre of the confidence interval
total = st.stdev(sample[5]) # sample standard deviation
x
119.81640007
# Two-sided standard-normal critical value (used below for the CI limits).
z_criti = stats.norm.ppf(q=0.975)
# FIX: scipy renamed `interval`'s first keyword from `alpha` to `confidence`
# and removed the old name in scipy >= 1.11; passing the confidence level
# positionally works on every version.
z_int = stats.norm.interval(confidence)
print(z_int)
(-1.959963984540054, 1.959963984540054)
import math
# Standard error of the sample mean: s / sqrt(n).
stand_error = total / math.sqrt(n)
print(stand_error)
1.2056010568106443
# 95% z-based confidence interval: sample mean +/- z * standard error.
margin = z_criti * stand_error
CI_lowerlimit = x - margin
CI_upperlimit = x + margin
CI_lowerlimit, CI_upperlimit
(117.4534654189277, 122.1793347210723)
# z-based 95% CI around the mean of "Sample 7" (index 6).
# NOTE(review): n is set to 20 here, but at this point `sample` holds the
# 10 random samples of size 100 -- confirm whether a size-20 sample was
# intended.
n = 20
confidence = 0.95
alpha = 1 - confidence
x = st.mean(sample[6])
total = st.stdev(sample[6])
z_criti = stats.norm.ppf(q=0.975)
# FIX: pass the confidence level positionally; the `alpha=` keyword was
# renamed/removed in recent scipy releases.
z_int = stats.norm.interval(confidence)
stand_error = total / math.sqrt(n)
CI_lowerlimit = x - z_criti * stand_error
CI_upperlimit = x + z_criti * stand_error
CI_lowerlimit, CI_upperlimit
(115.01015294344866, 125.74684733655134)
# z-based 95% CI built from the full Close_ETF series but with n = 20,
# i.e. the interval a size-20 sample would give if it matched the
# population mean and standard deviation exactly.
n = 20
confidence = 0.95
alpha = 1 - confidence
x = st.mean(df['Close_ETF'])
total = st.stdev(df['Close_ETF'])
z_criti = stats.norm.ppf(q=0.975)
# FIX: positional confidence level (scipy removed the `alpha=` keyword).
z_int = stats.norm.interval(confidence)
stand_error = total / math.sqrt(n)
CI_lowerlimit = x - z_criti * stand_error
CI_upperlimit = x + z_criti * stand_error
CI_lowerlimit, CI_upperlimit
(115.64410774211862, 126.66181228188137)
import scipy.stats

# Two-sided z-test: H0: mu equals the population mean, using "Sample 6"
# (index 5).
x = st.mean(sample[5])
u = st.mean(data)
s = st.stdev(data)
n = len(sample[5])
alpha = 0.05
z_test = (x - u) / (s / math.sqrt(n))
z_critical1 = scipy.stats.norm.ppf(alpha / 2)      # lower critical value
z_critical2 = scipy.stats.norm.ppf(1 - alpha / 2)  # upper critical value
print("z critical 1 =", z_critical1)
print("z critical 2 =", z_critical2)
print("z_test =", z_test)
# BUG FIX: the original condition (z_test > lower OR z_test < upper) is
# true for every z between the critical values, so it always rejected H0.
# Reject only when the statistic falls OUTSIDE the acceptance region.
if z_test < z_critical1 or z_test > z_critical2:
    print("Reject H0")
else:
    print("Accept H0")
z critical 1 = -1.9599639845400545 z critical 2 = 1.959963984540054 z_test = -1.063311247607618 Reject H0
# u = 100
# z_score = (x - u)/stand_error
# p_value = scipy.stats.norm.sf(abs(z_score))
# print('P-value =',(p_value))
# if p_value<alpha :
# print('p_value is less than alpha. Therefore we reject the Null hypothesis')
# else:
# print('p_value is greater than alpha. We do not reject Null hypothesis')
# Two-sided z-test for "Sample 7" (index 6) against the population mean.
x = st.mean(sample[6])
u = st.mean(data)
s = st.stdev(data)
n = len(sample[6])
alpha = 0.05
z_test = (x - u) / (s / math.sqrt(n))
z_critical1 = scipy.stats.norm.ppf(alpha / 2)      # lower critical value
z_critical2 = scipy.stats.norm.ppf(1 - alpha / 2)  # upper critical value
print("z critical 1 =", z_critical1)
print("z critical 2 =", z_critical2)
print("z_test =", z_test)
# BUG FIX: reject H0 only when z_test lies OUTSIDE
# [z_critical1, z_critical2]; the original OR-condition was satisfied by
# every value in between.
if z_test < z_critical1 or z_test > z_critical2:
    print("Reject H0")
else:
    print("Accept H0")
z critical 1 = -1.9599639845400545 z critical 2 = 1.959963984540054 z_test = -0.616127916781718 Reject H0
from scipy.stats import chi2
from scipy.stats import chi2_contingency

# Two-sided chi-square test for a population standard deviation:
# H0: sigma = 15 vs H1: sigma != 15, using "Sample 6" (index 5).
n = len(sample[5])
s = st.stdev(sample[5])
sigma = 15
alpha = 0.05
dof = n - 1
# BUG FIX: the chi-square statistic is (n-1) * s^2 / sigma^2, not
# (n-1) / (s / sigma).
test_stat = (n - 1) * s ** 2 / sigma ** 2
# BUG FIX: a two-sided test needs BOTH tail critical values; the original
# overwrote the lower one and then used an always-true comparison against
# a single value.
critical_low = chi2.ppf(alpha / 2, dof)
critical_high = chi2.ppf(1 - alpha / 2, dof)
print("test_statisics: ", test_stat)
print("critical_val: ", critical_high)
if test_stat < critical_low or test_stat > critical_high:
    print("Reject H0")
else:
    print("Accept H0")
test_statisics: 123.17507450835281 critical_val: 128.4219886438403 Reject H0
# Upper-tail chi-square test: H0: sigma <= 15 vs H1: sigma > 15.
# BUG FIX: the statistic is (n-1) * s^2 / sigma^2, and H0 is rejected when
# the statistic EXCEEDS the upper critical value (the original comparison
# was inverted).
test_stat = (n - 1) * s ** 2 / sigma ** 2
critical_val = chi2.ppf(1 - alpha, dof)
print("test_statisics: ", test_stat)
print("critical_val: ", critical_val)
if test_stat > critical_val:
    print("Reject H0")
else:
    print("Accept H0")
test_statisics: 123.17507450835281 critical_val: 123.2252214533618 Reject H0
from statsmodels.stats import weightstats

# Two-sample z-test: H0: mean(gold) = mean(oil).
sample_gold = df['gold'].tolist()
# BUG FIX: the oil sample was mistakenly loaded from the 'gold' column,
# which made this a test of gold against itself (guaranteed non-rejection).
sample_oil = df['oil'].tolist()
zstat, p = weightstats.ztest(sample_gold, sample_oil, alternative='two-sided', usevar='pooled', value=0)
if p < 0.05:
    print("Reject H0")
else:
    print("Accept H0")
Accept H0
from scipy.stats import ttest_1samp, f, t

# Paired t-test via a one-sample t-test on the element-wise differences.
# H0: the mean difference between the gold and oil series is zero.
random_sample = [g - o for g, o in zip(sample_gold, sample_oil)]
n = len(sample_gold)
tstati, p = ttest_1samp(random_sample, 0)
if p < 0.05:
    print("Reject H0")
else:
    print("Accept H0")
Accept H0
# F-test for equality of variances between the gold and oil series.
# BUG FIX: the F statistic is the RATIO OF VARIANCES (s1^2 / s2^2); the
# original used the ratio of standard deviations.
# 999 = n - 1 for the 1000 observations in each series.
fstat = st.variance(sample_gold) / st.variance(sample_oil)
p_value = f.sf(fstat, 999, 999)
if p_value < 0.05:
    print("Reject H0")
else:
    print("Accept H0")
Accept H0
# ETF close (y-axis) against gold (x-axis), plus their Pearson correlation.
x = df['Close_ETF']
y = df['gold']
plt.scatter(y, x)
plt.show()

correlation = x.corr(y)
print(correlation)
0.022995570076054603
# Least-squares fit of Close_ETF on gold: y ~ m*x + c.
y = df['Close_ETF']
x = df['gold']
plt.scatter(x, y, label='Scatter Plot')

# Design matrix with an intercept column, solved with np.linalg.lstsq.
A = np.vstack([x, np.ones(len(x))]).T
m, c = np.linalg.lstsq(A, y, rcond=None)[0]

plt.plot(x, y, 'o', label='Original data', markersize=10)
plt.plot(x, m * x + c, 'r', label='Fitted line')
plt.legend()
plt.show()
m, c
(25.604389324427057, 121.13598849889821)
import math

# t-test for the slope of the simple regression of y (gold) on x (ETF).
# H0: slope = 0.
x = df['Close_ETF']
y = df['gold']
mean_x = np.mean(x)
n = len(y)

# Corrected sums of squares and cross-products.
ssxx = sum(x * x) - (sum(x) * sum(x) / n)
ssyy = sum(y * y) - (sum(y) * sum(y) / n)
ssxy = sum(x * y) - (sum(x) * sum(y) / n)

# BUG FIX: the least-squares slope is SSxy / SSxx (the original inverted
# the ratio and mixed up the terms in the residual variance).
b = ssxy / ssxx
# Residual standard error: sqrt((SSyy - b*SSxy) / (n - 2)).
s = np.sqrt((ssyy - (b * ssxy)) / (n - 2))
# Standard error of the slope: s / sqrt(SSxx).
seb = s / np.sqrt(ssxx)

# BUG FIX: test the slope computed HERE (b), not the stale `m` from a
# previous cell, and use a two-sided p-value with n-2 degrees of freedom.
test_stat = b / seb
p_value = 2 * t.sf(abs(test_stat), n - 2)
if p_value < 0.02:
    print("Reject H0")
else:
    print("Accept H0")
Accept H0
from sklearn.metrics import r2_score
x = df['Close_ETF']
y = df['gold']
# NOTE(review): r2_score expects (y_true, y_pred); passing two unrelated
# series (ETF prices vs. gold returns) yields a meaningless, hugely
# negative value. The model's R^2 should instead be computed from the
# regression's fitted values (or as the squared correlation) -- confirm
# the intended comparison.
r2 = r2_score(x,y)
r2
-92.99113815628421
# 98% confidence interval for the regression slope b.
# BUG FIX: `t` here is scipy's t *distribution object*, so `t * seb`
# raised a TypeError; the interval needs the t critical value
# t.ppf(1 - alpha/2, n - 2) (alpha = 0.02, as in the test above).
t_crit = t.ppf(1 - 0.02 / 2, n - 2)
CI_lower = b - (t_crit * seb)
CI_upper = b + (t_crit * seb)
CI_lower, CI_upper
Consider the data including the ETF, Gold and Oil columns. Using any software, fit a multiple linear regression model to the data with the ETF variable as the response. Evaluate your model with the adjusted R².
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Multiple linear regression: Close_ETF ~ gold + oil, on a 70/30 split.
y = df['Close_ETF']
x = df[['gold', 'oil']]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=100)

mlr = LinearRegression()
mlr.fit(x_train, y_train)
print("Intercept: ", mlr.intercept_)
print("Coefficients:", list(zip(x, mlr.coef_)))

# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1), p = #predictors.
adj_r2 = 1 - (1 - mlr.score(x_train, y_train)) * (len(y_train) - 1) / (len(y_train) - x_train.shape[1] - 1)
# FIX: corrected the "adjusterd" typo in the printed label.
print("adjusted r2:", adj_r2)
Calculate the residuals of the model fitting you did in Part 9. Check the four assumptions made for the error terms of the multiple regression model using these residuals (mean 0; constant variance; normality; and the independence). You may draw some plots over the residuals to check these assumptions. For example, draw a Normal Probability Plot to check the normality assumption; draw a scatter plot of Residuals vs. Fitted Values to check the constant variance assumption and the independence assumption; and so on. You may refer to the following link https://www.youtube.com/watch?v=4zQkJw73U6I for some hints. In your project report, all the relevant plots and at least one paragraph of summary of checking the four assumptions using those plots must be included. Discuss how you may improve the quality of your regression model according to the strategy of model selection.
# Fit the multiple regression (Close_ETF ~ oil + gold) with statsmodels so
# that fitted values and residuals are available for diagnostics.
# BUG FIX: `result` was used to compute the residuals BEFORE the model was
# fitted, which raises NameError in a fresh session; fit first, then
# derive the residuals.
x = df.iloc[:, 1:3]  # predictors: the oil and gold columns
y = df.iloc[:, 0]    # response: Close_ETF
x = sm.add_constant(x)  # include the intercept term
model = sm.OLS(y, x)
result = model.fit()

# Residuals = observed - fitted (identical to result.resid).
residuals = df['Close_ETF'] - result.fittedvalues

# Diagnostic scatter of residuals against fitted values (constant-variance
# and independence checks).
plt.scatter(residuals, result.fittedvalues)
plt.xlabel('Residuals')
plt.ylabel('Fitted Values')
plt.title('Residuals vs Fitted Values')
Text(0.5, 1.0, 'Scatter Plot: Residuals vs Fitted Values')
plt.scatter(df['Close_ETF'], result.fittedvalues)
<matplotlib.collections.PathCollection at 0x7fc8505acbe0>